In [1]:
    
# Load ../css/custom.css and render its contents as HTML in the notebook.
from IPython.core.display import HTML

# Read through a context manager so the file handle is closed as soon as the
# contents are in memory, instead of being left open until garbage collection.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()

# Last expression of the cell: display the stylesheet contents as HTML.
HTML(styles)
    
    Out[1]:
In [2]:
    
import pandas as pd
import numpy as np
import cPickle as pickle
import json
    
In [12]:
    
import os

# Location of the raw tweet dump. The default is the original machine-specific
# absolute path; set the TWEETS_JSON environment variable to point the
# notebook at the file on another machine without editing this cell.
jsonpath = os.environ.get(
    'TWEETS_JSON',
    '/Users/rcn/Desktop/twitter-analysis/data/raw/tweets.json',
)
    
In [21]:
    
# Parse the file at `jsonpath` into a DataFrame, reading it in the
# 'records' orientation.
tweets = pd.read_json(
    jsonpath,
    orient='records',
)
    
In [20]:
    
# json_normalize flattens already-parsed JSON (a dict or a list of dicts); it
# does not read files, so it cannot be handed the path string directly.
# Parse the file first, then flatten the nested records into columns.
with open(jsonpath) as json_file:
    tweet_records = json.load(json_file)

# Newer pandas exposes the function as pd.json_normalize; older releases only
# have pd.io.json.json_normalize. Use whichever is available.
_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
tweets = _normalize(tweet_records)
    
    
In [15]:
    
# Report how many entries `tweets` holds.
print('We have %d tweets in total' % (len(tweets),))
    
    
In [22]:
    
# Wrap the loaded tweets in a DataFrame; every later cell works on `twitterData`.
twitterData = pd.DataFrame(data=tweets)
    
In [23]:
    
# Preview the first five rows of the frame.
twitterData.head(n=5)
    
    Out[23]:
In [24]:
    
# FIXME(review): this cell originally held only the incomplete statement
# `text =`, which is a SyntaxError and stops the notebook from running top to
# bottom. The intended right-hand side cannot be recovered from the notebook
# (presumably a text column of twitterData — confirm), so the statement is
# disabled until it is completed.
# text =
    
    Out[24]:
In [18]:
    
# Show the dtype of every column in the frame.
twitterData.dtypes
    
    Out[18]:
In [10]:
    
# Earlier attempts at building a `twitter_mentions_list` column, kept for
# reference (all disabled):
#twitterData.twitter_mentions=
#twitterData[30:45].twitter_mentions.str.split(',').astype(list).astype('str')
#twitterData.twitter_mentions_list=twitterData.twitter_mentions.str.split(',').astype(list).astype('str')
#twitterData.twitter_mentions_list=twitterData.twitter_mentions.apply(lambda x: list(str(x).split(',')))
#twitterData.twitter_mentions_list[40:45].get_values()

# Peek at the raw values of rows 40-44.
# `.values` replaces Series.get_values(), which was deprecated in pandas 0.25
# and removed in 1.0; for an ordinary Series both return the same array.
# NOTE(review): `twitterData.entities.user_mentions` assumes attribute access
# reaches a nested `user_mentions` field — confirm against the loaded schema.
twitterData.entities.user_mentions[40:45].values
    
    
In [10]:
    
# Bare expression: displays the repr of the Series.get_values method object
# (interactive introspection only; nothing is assigned or computed).
pd.Series.get_values
    
    Out[10]:
In [8]:
    
# Take the one-row slice at position 44, then the first element of the value
# stored there (presumably a list of mentions — confirm).
# `.values` replaces Series.get_values(), which was deprecated in pandas 0.25
# and removed in 1.0.
# NOTE(review): in the visible cells `twitter_mentions_list` is only assigned
# inside commented-out code, so this cell depends on state created elsewhere.
twitterData.twitter_mentions_list[44:45].values[0][0]
    
    Out[8]:
In [9]:
    
# Summary statistics for the frame's columns.
twitterData.describe()
    
    Out[9]:
In [10]:
    
# Number of rows in the full dataset.
nTweets = len(twitterData.index)

# Single-argument print(...) behaves the same under Python 2 and Python 3,
# unlike the Python 2-only print statement, and matches the call form already
# used earlier in the notebook. The printed text is unchanged.
print("There are %d tweets in the full dataset" % nTweets)
    
    
In [11]:
    
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
    
In [9]:
    
#twitterData['Friends'].plot()
    
In [12]:
    
# Star import: pulls every public name of bokeh.plotting into the notebook
# namespace. Later cells call figure/hold/line/legend/show/output_file
# unqualified — presumably all supplied by this import (confirm for the
# installed bokeh version).
from bokeh.plotting import *
# Route bokeh output to the notebook.
output_notebook()
    
    
In [13]:
    
# Count the non-null entries of the `id` column and display the result.
# NOTE(review): this rebinds `tweets`, which an earlier cell used for the
# loaded tweet data; re-running that earlier print cell afterwards would
# operate on this count instead.
tweets = twitterData['id'].count()
tweets
    
    Out[13]:
In [14]:
    
from ggplot import *
%matplotlib inline
    
In [18]:
    
# Build a styled bokeh time-series figure and draw `id` against `time` on it.
# The calls below act on bokeh's implicit "current plot", so their order matters.
# NOTE(review): the y values are the raw `id` column although the title says
# "Number of Tweets" — confirm this is the intended series.
figure(
    title='Number of Tweets',        # Plot title
    title_text_font='Courier New',  # Title font
    title_text_color='#5d6263',     # Title font colour
    plot_width=1000,                # Plot width
    plot_height=600,                # Plot height
    background_fill='#f6f6f6',      # Background colour
    border_fill='#f6f6f6',          # Border background
    border_symmetry='hv',           # h=horizontal, v=vertical
    outline_line_color='#f6f6f6',   # Plot area border colour
    x_axis_type = 'datetime',       # For timeseries only
    tools='pan,box_zoom,previewsave,resize,select,reset' # Available: pan,wheel_zoom,box_zoom,previewsave,resize,select,reset
)
# Presumably keeps subsequent glyphs on the same plot — confirm for the installed bokeh version.
hold()
line(
    twitterData.time,             # x
    twitterData.id,                 # y
    color='#00aeef',                # Line colour
    line_width=3,                   # Line width in px
    legend='Tweets',                 # Legend label
)
# Legend styling.
legend().label_text_font='Courier New'
legend().label_text_color='#5d6263'
legend().label_outline_line_color='#f6f6f6'
# Axis styling: y-axis line colour set to None, light grey x-axis line, shared label font.
yaxis().axis_line_color = None
xaxis().axis_line_color = '#d4d4d4'
axis().major_label_text_font="Courier New"
axis().major_label_text_font_size="12pt"
# Grid styling: x grid line colour set to None, light grey y grid lines.
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 1
# Render the finished figure.
show()
    
    
In [22]:
    
# Send bokeh output to friends.html, then draw `friends` against `time`.
# NOTE(review): the columns are spelled 'time'/'friends' here but 'Friends'
# in later cells — confirm which spelling the loaded frame actually uses.
output_file("friends.html", title="timeseries example")
hold()
line(
    twitterData['time'],                                       # x coordinates
    twitterData['friends'],                                  # y coordinates
    color='#A6CEE3',                                    # set a color for the line
    legend='Friends',                                      # attach a legend label
    x_axis_type = "datetime",                           # NOTE: only needed on first
    tools="pan,wheel_zoom,box_zoom,reset,previewsave"   # NOTE: only needed on first
)
    
    Out[22]:
In [23]:
    
# Draw a cumulative sum against tweet time and show the plot.
# NOTE(review): `twitterData.cumsum()` accumulates every column of the frame,
# not a single series — presumably one specific column was intended for y; confirm.
x = twitterData['time']
y = twitterData.cumsum()
line(x,y, color="#0000FF", tools=[])
show()
    
    
In [29]:
    
# Relative frequency of each value in the `type` column, most common first
# (sort order, bins and ascending flag are left at value_counts' defaults).
nDataSiftType = twitterData['type'].value_counts(normalize=True)
# Show the ten most frequent values.
nDataSiftType[:10]
    
    Out[29]:
In [25]:
    
# Taxonomy languages: en, hi, ur, sw, ha, ig, yo.
# Relative frequency of each `datasift_lang` value, most common first.
nDataSiftLanguage = twitterData['datasift_lang'].value_counts(normalize=True)
# Show the ten most frequent languages.
nDataSiftLanguage[:10]
    
    Out[25]:
In [26]:
    
# Relative frequency of each `twitter_lang` value, most common first.
nTwitterLanguage = twitterData['twitter_lang'].value_counts(normalize=True)
# Show the ten most frequent languages.
nTwitterLanguage[:10]
    
    Out[26]:
In [27]:
    
# Locations of interest: "IN", "PK", "NG" and "KE".
# Absolute tweet counts per `twitter_location` value, most common first.
nLocation = twitterData['twitter_location'].value_counts()
# Show the fifteen most frequent locations.
nLocation[:15]
    
    Out[27]:
In [28]:
    
# Locations of interest: "IN", "PK", "NG" and "KE".
# Absolute tweet counts per `UNGP_location` value, most common first.
nUngpLocation = twitterData['UNGP_location'].value_counts()
# Show the fifteen most frequent locations.
nUngpLocation[:15]
    
    Out[28]:
In [37]:
    
# Getting Vincent ready
# The visible cells only import names from vincent's submodules
# (`from vincent.axes import ...`), which does not bind the name `vincent`
# itself, so import the package here before its first use.
import vincent

vincent.initialize_notebook()

# Hex colour constants; several of them style the bar charts in later cells.
gpBlue = '#00aeef'
gpLightGray = '#96999b'
gpDarkBlue = '#00447c'
gpRed = '#cf5c42'
gpBrown = '#e1d8ad'
gpPink = '#f4d5e3'
gpLightBlue = '#e1f4fd'
    
    
In [39]:
    
# Bar chart of the mean `Followers` value per `UNGPLocation` group.
from vincent.axes import AxisProperties
from vincent.properties import PropertySet
from vincent.values import ValueRef

# Per-location column means; groups whose means contain NaN are dropped.
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()

# DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is its replacement and orders the rows the same way.
mean_followers = mean_location_grouped.sort_values('Followers')['Followers']

followersBar = vincent.Bar(mean_followers)
followersBar.axis_titles(x='Country', y='Followers')

# Apply the same light-grey styling to every axis of the chart.
for axis in followersBar.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop, PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis: horizontal, centred labels and a title offset of 20.
followersBar.axes[0].properties.labels.angle = ValueRef(value=0)
followersBar.axes[0].properties.labels.align = ValueRef(value='center')
followersBar.axes[0].properties.title.dy = ValueRef(value=20)

# Third scale gets a single-colour range (presumably the bar colour scale — confirm).
followersBar.scales[2].range = [gpBlue]

# Write the chart spec to disk, then display the chart as the cell output.
followersBar.to_json('../charts/followersBar.json')
followersBar
    
    Out[39]:
In [40]:
    
# Bar chart of the mean `Friends` value per `UNGPLocation` group.
# Per-location column means; groups whose means contain NaN are dropped.
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()

# DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is its replacement and orders the rows the same way.
mean_friends = mean_location_grouped.sort_values('Friends')['Friends']

friendsBar = vincent.Bar(mean_friends)
friendsBar.axis_titles(x='Country', y='Friends')

# Apply the same light-grey styling to every axis of the chart.
for axis in friendsBar.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop, PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis: horizontal, centred labels and a title offset of 20.
friendsBar.axes[0].properties.labels.angle = ValueRef(value=0)
friendsBar.axes[0].properties.labels.align = ValueRef(value='center')
friendsBar.axes[0].properties.title.dy = ValueRef(value=20)

# Third scale gets a single-colour range (presumably the bar colour scale — confirm).
friendsBar.scales[2].range = [gpDarkBlue]

# Write the chart spec to disk, then display the chart as the cell output.
friendsBar.to_json('../charts/friendsBar.json')
friendsBar
    
    Out[40]:
In [41]:
    
# Bar chart of the mean `UNGPGenderProb` value per `UNGPLocation` group.
# Per-location column means; groups whose means contain NaN are dropped.
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()

# DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is its replacement and orders the rows the same way.
mean_genderProb = mean_location_grouped.sort_values('UNGPGenderProb')['UNGPGenderProb']

genderProb = vincent.Bar(mean_genderProb)
# Axis title spelling corrected ("Probablility" -> "Probability").
genderProb.axis_titles(x='Country', y='Average Gender Probability')

# Apply the same light-grey styling to every axis of the chart.
for axis in genderProb.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop, PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis: horizontal, centred labels and a title offset of 20.
genderProb.axes[0].properties.labels.angle = ValueRef(value=0)
genderProb.axes[0].properties.labels.align = ValueRef(value='center')
genderProb.axes[0].properties.title.dy = ValueRef(value=20)

# Third scale gets a single-colour range (presumably the bar colour scale — confirm).
genderProb.scales[2].range = [gpRed]

# Write the chart spec to disk, then display the chart as the cell output.
genderProb.to_json('../charts/genderProbBar.json')
genderProb
    
    Out[41]:
In [42]:
    
# `mpld3` is not imported in any visible cell, so import it here before use.
import mpld3

mpld3.enable_notebook()

# Tweet counts per `UNGPLocation` value, most common first, drawn as a
# horizontal bar chart.
gatesCountry = twitterData.UNGPLocation.value_counts(normalize=False, sort=True, ascending=False, bins=None)
gatesCountryFig = gatesCountry.plot(kind='barh', color='#00aeef')

# Display the current matplotlib figure through mpld3.
mpld3.display()
    
    
    
In [70]:
    
import ggplot as gg

# Bar chart of tweet counts per `UNGPLocation`.
# Every ggplot name is taken from the `gg` alias imported above, so the cell
# no longer depends on a bare `ggplot` left behind by an earlier star import.
(gg.ggplot(gg.aes(x='UNGPLocation'), data=twitterData)
 + gg.geom_bar()
 + gg.ggtitle("Gates Tweets")
 + gg.labs("Country", "Number of tweets"))
    
    
In [77]:
    
# Bar chart of tweet counts per `DataSiftLanguage`, built layer by layer.
languagePlot = (
    ggplot(aes(x='DataSiftLanguage'), data=twitterData)
    + geom_bar()
    + ggtitle("Language Distribution")
    + labs("Language", "Number of tweets")
)
# Last expression of the cell: render the plot.
languagePlot
    
    
In [1]:
    
# Load ../css/custom.css and render its contents as HTML in the notebook.
from IPython.core.display import HTML

# Read through a context manager so the file handle is closed as soon as the
# contents are in memory, instead of being left open until garbage collection.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()

# Last expression of the cell: display the stylesheet contents as HTML.
HTML(styles)
    
    Out[1]: